标签
爬取信息
字数
381 字
阅读时间
3 分钟
一、概述
二、使用示例
2.1 入门Demo
依赖
xml
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
代码实现
爬虫实现
java
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.util.List;
public class LianjiaPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
/**
 * Extracts listing fields from the downloaded page and schedules follow-up requests.
 *
 * <p>Every link matched by the {@code .content__list--item--title a} selector is queued,
 * and the {@code title}, {@code rent}, {@code type}, {@code info} and {@code img} fields are
 * stored on the page. When no title could be extracted the page is marked as skipped and
 * the paginated index URLs {@code pg1} through {@code pg100} are queued instead.
 *
 * @param page the page handed over by the crawler
 */
@Override
public void process(Page page) {
    final Html document = page.getHtml();

    // Links to the individual listing detail pages.
    page.addTargetRequests(
            document.css(".content__list--item--title a").links().all());

    final String listingTitle =
            document.xpath("//div[@class='content clear w1150']/p/text()").toString();
    page.putField("title", listingTitle);
    page.putField("rent",
            document.xpath("//p[@class='content__aside--title']/span/text()").toString());
    page.putField("type",
            document.xpath("//p[@class='content__article__table']/allText()").toString());
    page.putField("info",
            document.xpath("//div[@class='content__article__info']/allText()").toString());
    page.putField("img",
            document.xpath("//div[@class='content__article__slide__item']/img").toString());

    // A stored title means there is something to hand on; nothing else to do here.
    if (page.getResultItems().get("title") != null) {
        return;
    }

    // No title: keep this page away from the pipelines and queue the pagination links.
    page.setSkip(true);
    int pageNumber = 1;
    while (pageNumber <= 100) {
        page.addTargetRequest("https://sh.lianjia.com/zufang/pg" + pageNumber);
        pageNumber++;
    }
}
/**
 * Returns the crawl settings held in this processor's {@code site} field.
 *
 * @return the {@link Site} instance configured for this processor
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Starts the crawl: one thread, seeded with the rental index URL, with results
 * passed to {@code MyPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider crawler = Spider.create(new LianjiaPageProcessor());
    crawler.addUrl("https://sh.lianjia.com/zufang/");
    crawler.addPipeline(new MyPipeline());
    crawler.thread(1);
    crawler.run();
}
}
爬取数据处理
java
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
/**
 * Pipeline that turns the fields extracted for one page into a JSON record.
 *
 * <p>For every processed item the referenced image is downloaded into {@link #IMAGE_DIR}
 * and the assembled record is appended, one JSON document per line, to {@link #DATA_FILE}.
 * Items whose {@code type} or {@code img} field is missing or malformed are skipped with a
 * message on standard error instead of failing with an unchecked exception.
 */
public class MyPipeline implements Pipeline {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Directory prefix under which downloaded images are stored. */
    private static final String IMAGE_DIR = "F:\\code\\images\\";

    /** File that receives one JSON record per line. */
    private static final String DATA_FILE = "F:\\code\\data.json";

    /**
     * Builds the record for a single item, downloads its image and appends the record
     * to the data file.
     *
     * @param resultItems extracted fields; {@code title}, {@code rent}, {@code type},
     *                    {@code info} and {@code img} are read
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.getRequest().getUrl();

        // StringUtils.split returns null for null input, so validate before indexing.
        String type = resultItems.get("type");
        String[] types = StringUtils.split(type, ' ');
        if (types == null || types.length < 3) {
            System.err.println("Skipping " + url
                    + ": expected at least 3 space-separated values in 'type' but got: " + type);
            return;
        }

        String img = resultItems.get("img");
        String[] imgParts = StringUtils.split(img, '"');
        if (imgParts == null || imgParts.length < 4) {
            System.err.println("Skipping " + url
                    + ": could not find an image URL in 'img': " + img);
            return;
        }
        // Fourth quote-delimited token, i.e. the second double-quoted value in the element.
        String imageUrl = imgParts[3];

        Map<String, Object> data = new HashMap<>();
        data.put("url", url);
        data.put("title", resultItems.get("title")); // title
        data.put("rent", resultItems.get("rent"));   // rent
        data.put("rentMethod", types[0]);            // rental method
        data.put("houseType", types[1]);             // layout, e.g. 2室1厅1卫
        data.put("orientation", types[2]);           // orientation

        // "time" and "floor" are optional: they are only set when a matching entry exists.
        String info = resultItems.get("info");
        String[] infos = StringUtils.split(info, ' ');
        if (infos != null) {
            for (String entry : infos) {
                // substringAfter keeps the whole value and tolerates an empty one,
                // where indexing the result of split(entry, ':') would throw.
                if (StringUtils.startsWith(entry, "看房:")) {
                    data.put("time", StringUtils.substringAfter(entry, ":"));
                } else if (StringUtils.startsWith(entry, "楼层:")) {
                    data.put("floor", StringUtils.substringAfter(entry, ":"));
                }
            }
        }

        // Image file name: last path segment of the page URL, without extension, plus ".jpg".
        String newName = StringUtils.substringBefore(
                StringUtils.substringAfterLast(url, "/"), ".") + ".jpg";

        try {
            this.downloadFile(imageUrl, new File(IMAGE_DIR + newName));
            data.put("image", newName);
            String json = MAPPER.writeValueAsString(data);
            FileUtils.write(new File(DATA_FILE), json + "\n", "UTF-8", true);
        } catch (Exception e) {
            // Best effort: a failed download or write must not abort the crawl.
            System.err.println("Failed to store record for " + url);
            e.printStackTrace();
        }
    }

    /**
     * Downloads a file over HTTP and writes it to {@code dest}.
     *
     * <p>Both the HTTP client and the response are closed before this method returns.
     *
     * @param url  URL of the file to download
     * @param dest destination file
     * @throws Exception if the request or the file write fails
     */
    public void downloadFile (String url, File dest) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        // Resources are closed in reverse order: response first, then the client.
        try (CloseableHttpClient client = HttpClientBuilder.create().build();
             CloseableHttpResponse response = client.execute(httpGet)) {
            FileUtils.writeByteArrayToFile(dest,
                    IOUtils.toByteArray(response.getEntity().getContent()));
        }
    }
}